Objectives: 1) Learn the basics of dplyr -Selecting, Filtering, and Arranging -Making a series of function with pipes %>% -Mutate new and existing columns -Wide and long tables 2) Learn the basics of ggplot2 -Setting up your dataframe for easy plotting -Plot lines, scatter plots, boxplots -Compare different subpopulations

Installing and Loading packages

#Comment out if you've already installed them
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("ggplot2")

#Load libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.6.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.2

Loading your dataset

#setwd("YOUR/PATH/HERE")
PKdata <- read.csv("simtab.csv")

#Let's take a glimpse at your dataset
head(PKdata)
##   ID PT_BIRTHDATE PT_USUBJID               PT_ADDRESS TIME   IPRED      DV
## 1  1   09/20/1982    XDP5409 7385 San Francisco Drive    0 0.00000 0.00000
## 2  1   09/20/1982    XDP5409 7385 San Francisco Drive    1 0.35767 0.36910
## 3  1   09/20/1982    XDP5409 7385 San Francisco Drive    2 0.45474 0.49766
## 4  1   09/20/1982    XDP5409 7385 San Francisco Drive    4 0.43277 0.55543
## 5  1   09/20/1982    XDP5409 7385 San Francisco Drive    8 0.29634 0.28532
## 6  1   09/20/1982    XDP5409 7385 San Francisco Drive   12 0.19763 0.18520
##   AMT HT     BW SEX HIV IWRES CWRES    PRED        RES WRES DOSE
## 1 300 69 78.298   1   0    NA     0      NA  0.0000000    0  300
## 2   0 69 78.298   1   0   -10     0 0.36325  0.0058536    0  300
## 3   0 69 78.298   1   0   -10     0 0.47078  0.0268820    0  300
## 4   0 69 78.298   1   0   -10     0 0.46926  0.0861620    0  300
## 5   0 69 78.298   1   0   -10     0 0.35684 -0.0715120    0  300
## 6   0 69 78.298   1   0   -10     0 0.26497 -0.0797670    0  300
#Alternatively, click on PKdata in your Global Environment toolbar on the right

#Let's open up the data dictionary too, it's always good practice to know your dataset

Dplyr Cheatsheet https://raw.githubusercontent.com/rstudio/cheatsheets/master/data-transformation.pdf

Selecting

#You can select specific columns using select()
#Let's de-identify our dataset by selecting the columns we want
deidentified <- select(PKdata, ID, TIME, DV, DOSE, HT, BW, SEX, HIV)
head(deidentified)
##   ID TIME      DV DOSE HT     BW SEX HIV
## 1  1    0 0.00000  300 69 78.298   1   0
## 2  1    1 0.36910  300 69 78.298   1   0
## 3  1    2 0.49766  300 69 78.298   1   0
## 4  1    4 0.55543  300 69 78.298   1   0
## 5  1    8 0.28532  300 69 78.298   1   0
## 6  1   12 0.18520  300 69 78.298   1   0
#You can also do this by "subtracting" the columns you don't want with -
deidentified <- select(PKdata, -PT_BIRTHDATE, -PT_ADDRESS, -PT_USUBJID)
head(deidentified)
##   ID TIME   IPRED      DV AMT HT     BW SEX HIV IWRES CWRES    PRED
## 1  1    0 0.00000 0.00000 300 69 78.298   1   0    NA     0      NA
## 2  1    1 0.35767 0.36910   0 69 78.298   1   0   -10     0 0.36325
## 3  1    2 0.45474 0.49766   0 69 78.298   1   0   -10     0 0.47078
## 4  1    4 0.43277 0.55543   0 69 78.298   1   0   -10     0 0.46926
## 5  1    8 0.29634 0.28532   0 69 78.298   1   0   -10     0 0.35684
## 6  1   12 0.19763 0.18520   0 69 78.298   1   0   -10     0 0.26497
##          RES WRES DOSE
## 1  0.0000000    0  300
## 2  0.0058536    0  300
## 3  0.0268820    0  300
## 4  0.0861620    0  300
## 5 -0.0715120    0  300
## 6 -0.0797670    0  300
#You can also search through your columns with helpers: contains(), starts_with(), ends_with()
What_did_this_do <- select(PKdata, ends_with("V"))
head(What_did_this_do)
##        DV HIV
## 1 0.00000   0
## 2 0.36910   0
## 3 0.49766   0
## 4 0.55543   0
## 5 0.28532   0
## 6 0.18520   0
#Write your answer commented out below:
#removed all columns that end with V

#Can you use helpers to create the same deidentified dataset?
deidentified <- select(PKdata, -starts_with("PT"))
head(deidentified)
##   ID TIME   IPRED      DV AMT HT     BW SEX HIV IWRES CWRES    PRED
## 1  1    0 0.00000 0.00000 300 69 78.298   1   0    NA     0      NA
## 2  1    1 0.35767 0.36910   0 69 78.298   1   0   -10     0 0.36325
## 3  1    2 0.45474 0.49766   0 69 78.298   1   0   -10     0 0.47078
## 4  1    4 0.43277 0.55543   0 69 78.298   1   0   -10     0 0.46926
## 5  1    8 0.29634 0.28532   0 69 78.298   1   0   -10     0 0.35684
## 6  1   12 0.19763 0.18520   0 69 78.298   1   0   -10     0 0.26497
##          RES WRES DOSE
## 1  0.0000000    0  300
## 2  0.0058536    0  300
## 3  0.0268820    0  300
## 4  0.0861620    0  300
## 5 -0.0715120    0  300
## 6 -0.0797670    0  300

Filtering

#You can filter out specific rows using filter()
#For example, selecting all patients who received a specific dose
Dosed_600 <- filter(PKdata, DOSE == 600)
head(Dosed_600)
##    ID PT_BIRTHDATE PT_USUBJID               PT_ADDRESS TIME   IPRED
## 1 301   09/20/1982    XDP5607 1296 San Francisco Drive    0 0.00000
## 2 301   09/20/1982    XDP5607 1296 San Francisco Drive    1 0.73153
## 3 301   09/20/1982    XDP5607 1296 San Francisco Drive    2 0.95626
## 4 301   09/20/1982    XDP5607 1296 San Francisco Drive    4 0.97317
## 5 301   09/20/1982    XDP5607 1296 San Francisco Drive    8 0.77547
## 6 301   09/20/1982    XDP5607 1296 San Francisco Drive   12 0.60406
##        DV AMT HT     BW SEX HIV IWRES CWRES    PRED      RES WRES DOSE
## 1 0.00000 600 70 64.202   1   0    NA     0      NA 0.000000    0  600
## 2 0.75396   0 70 64.202   1   0   -10     0 0.72684 0.027128    0  600
## 3 1.08000   0 70 64.202   1   0   -10     0 0.94255 0.137470    0  600
## 4 1.00730   0 70 64.202   1   0   -10     0 0.94086 0.066440    0  600
## 5 0.73731   0 70 64.202   1   0   -10     0 0.71775 0.019561    0  600
## 6 0.60851   0 70 64.202   1   0   -10     0 0.53471 0.073796    0  600
#You can use any logical function in R to designate your filter criteria
# > greater than, <= greater equal than
# < less than, <= lesser equal than
# ! not
# & and
# | or
#Can you filter for patients with body weight greater or equal than 45kg?
WT.GT.45 <- filter(PKdata, BW >= 45)
head(WT.GT.45)
##   ID PT_BIRTHDATE PT_USUBJID               PT_ADDRESS TIME   IPRED      DV
## 1  1   09/20/1982    XDP5409 7385 San Francisco Drive    0 0.00000 0.00000
## 2  1   09/20/1982    XDP5409 7385 San Francisco Drive    1 0.35767 0.36910
## 3  1   09/20/1982    XDP5409 7385 San Francisco Drive    2 0.45474 0.49766
## 4  1   09/20/1982    XDP5409 7385 San Francisco Drive    4 0.43277 0.55543
## 5  1   09/20/1982    XDP5409 7385 San Francisco Drive    8 0.29634 0.28532
## 6  1   09/20/1982    XDP5409 7385 San Francisco Drive   12 0.19763 0.18520
##   AMT HT     BW SEX HIV IWRES CWRES    PRED        RES WRES DOSE
## 1 300 69 78.298   1   0    NA     0      NA  0.0000000    0  300
## 2   0 69 78.298   1   0   -10     0 0.36325  0.0058536    0  300
## 3   0 69 78.298   1   0   -10     0 0.47078  0.0268820    0  300
## 4   0 69 78.298   1   0   -10     0 0.46926  0.0861620    0  300
## 5   0 69 78.298   1   0   -10     0 0.35684 -0.0715120    0  300
## 6   0 69 78.298   1   0   -10     0 0.26497 -0.0797670    0  300
#You can also use & or | to create more complex logical functions
#Let's filter for patients that received a dose greater than or equal to 600 and HIV+
Dosed600up_HIV <- filter(PKdata, DOSE >= 600 & HIV == 1)
head(Dosed600up_HIV)
##    ID PT_BIRTHDATE PT_USUBJID               PT_ADDRESS TIME   IPRED
## 1 315   09/20/1982    XDP4217 7593 San Francisco Drive    0 0.00000
## 2 315   09/20/1982    XDP4217 7593 San Francisco Drive    1 0.73035
## 3 315   09/20/1982    XDP4217 7593 San Francisco Drive    2 0.95281
## 4 315   09/20/1982    XDP4217 7593 San Francisco Drive    4 0.96497
## 5 315   09/20/1982    XDP4217 7593 San Francisco Drive    8 0.76058
## 6 315   09/20/1982    XDP4217 7593 San Francisco Drive   12 0.58588
##        DV AMT HT     BW SEX HIV IWRES CWRES    PRED         RES WRES DOSE
## 1 0.00000 600 64 65.279   1   1    NA     0      NA  0.00000000    0  600
## 2 0.73102   0 64 65.279   1   1   -10     0 0.71913  0.01188800    0  600
## 3 0.91986   0 64 65.279   1   1   -10     0 0.92031 -0.00044695    0  600
## 4 1.05310   0 64 65.279   1   1   -10     0 0.88979  0.16336000    0  600
## 5 0.84030   0 64 65.279   1   1   -10     0 0.63150  0.20880000    0  600
## 6 0.54619   0 64 65.279   1   1   -10     0 0.43691  0.10928000    0  600
#Challenge: Filter for patients who received the lowest or highest dose, and is female with HIV or male
#Remember there are always many ways to code something :)
Challenge <- filter(PKdata, DOSE != 600 & (HIV == 1 | SEX == 1))
head(Challenge)
##   ID PT_BIRTHDATE PT_USUBJID               PT_ADDRESS TIME   IPRED      DV
## 1  1   09/20/1982    XDP5409 7385 San Francisco Drive    0 0.00000 0.00000
## 2  1   09/20/1982    XDP5409 7385 San Francisco Drive    1 0.35767 0.36910
## 3  1   09/20/1982    XDP5409 7385 San Francisco Drive    2 0.45474 0.49766
## 4  1   09/20/1982    XDP5409 7385 San Francisco Drive    4 0.43277 0.55543
## 5  1   09/20/1982    XDP5409 7385 San Francisco Drive    8 0.29634 0.28532
## 6  1   09/20/1982    XDP5409 7385 San Francisco Drive   12 0.19763 0.18520
##   AMT HT     BW SEX HIV IWRES CWRES    PRED        RES WRES DOSE
## 1 300 69 78.298   1   0    NA     0      NA  0.0000000    0  300
## 2   0 69 78.298   1   0   -10     0 0.36325  0.0058536    0  300
## 3   0 69 78.298   1   0   -10     0 0.47078  0.0268820    0  300
## 4   0 69 78.298   1   0   -10     0 0.46926  0.0861620    0  300
## 5   0 69 78.298   1   0   -10     0 0.35684 -0.0715120    0  300
## 6   0 69 78.298   1   0   -10     0 0.26497 -0.0797670    0  300

Arranging

#arrange() will reorder rows or columns according to your specifications (default from low to high)
#For example in PK data we will often organize our rows by patient ID, then by time.
PKdata <- arrange(PKdata,ID, TIME)

Putting it all together with Pipes %>%

#Pipes are used to connect the output of one function to the input of another
#Let's deidentify our dataset, filter for 600mg dose, and arrange by id and time all together
Dose_600 <- PKdata %>%
  select( -starts_with("PT")) %>%
  filter(DOSE == 600) %>%
  arrange(ID, TIME)

#Try this one for yourself!
#Let's try filtering for HIV+ or body weight less than 45kg, then...
#we want to see the trough levels of each patient (defined as the 24hr timepoint)  

Trough <-   PKdata %>%
  select( -starts_with("PT")) %>%
  filter(HIV == 1 | BW < 45) %>%
  filter(TIME == 24)

View(Trough)

Mutate

#mutate() can create new columns or alter existing ones
#For example, BMI is often calculated from HT and WT
#BMI = weight(kg)/height(m)^2

PKdata <- PKdata %>%
  mutate(BMI = BW/(HT/39.37)^2) #are the units correct? check the data dictionary!

#Now you try making a new column and converting DV from mg/L into micromoles!
#The drug used here is remdesivir

PKdata<- PKdata %>%
  mutate(Conc_uM = DV/1000/602.6*1000000) #mg/L *1g/1000mg * 1mol/602.6g * 1000000uM/1M

Our first Plot!! ggplot2 cheatsheet: https://raw.githubusercontent.com/rstudio/cheatsheets/master/data-visualization-2.1.pdf

ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV)) ##What are we trying to plot with a PK profile?

#Do you like lines better?
ggplot(PKdata) + 
 geom_line(aes(x=TIME, y=DV,group=factor(ID)),alpha = 0.1, linetype=2,size=0.1)

#Let's put them on the same graph with a +

Graphing Subpopulations

ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(DOSE))) + 
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(DOSE)),alpha = 0.1, linetype=2,size=0.1)

Another way to stratify

#It's still a little too cluttered
ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(DOSE))) + 
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(DOSE)),alpha = 0.1, linetype=2,size=0.1) +
  facet_wrap(.~DOSE)

Let’s try some on your own

#Copy and paste from above, and let's plot stratified by HIV status
ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(HIV))) +
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(HIV)),alpha = 0.1, linetype=2,size=0.1) +
  facet_wrap(.~DOSE)

#Now let's try adding 2 stratifications: color by HIV and facet_wrap by Dose~SEX. Then try the other way around!
ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(HIV))) + 
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(HIV)),alpha = 0.1, linetype=2,size=0.1) +
  facet_wrap(DOSE~SEX)

ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(SEX))) +
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(SEX)),alpha = 0.1, linetype=2,size=0.1) +
  facet_wrap(DOSE~~HIV)

#Challenge: Plot stratified by body weight, one color for above 45kg and another color for below. Make sure to have a legend :)
#Hint: you will need to manipulate your dataframe

PKdata <- PKdata %>%
  mutate(WT_FLAG = ifelse(BW > 45, 1, 0))

ggplot(PKdata) + 
  geom_point(aes(x=TIME, y=DV, color = as.factor(WT_FLAG))) + 
  geom_line(aes(x=TIME, y=DV, group=factor(ID), color = as.factor(WT_FLAG)),alpha = 0.1, linetype=2,size=0.1) +
  facet_wrap(.~DOSE)

Summarise Dplyr can be used for simple summary statistics

#You can use summarise to calculate means, ranges, standard deviations, quartiles, etc.
Summary_stats <- PKdata %>%
  summarise(c_mean = mean(DV),
            c_stdev = sd(DV),
            BW_mean = mean(BW),
            BW_upper = quantile(BW, probs = 0.975),
            BW_lower = quantile(BW, probs = 0.225))

View(Summary_stats)

Combined with group_by(), summarise is a force to be reckoned with..

#Let's calculate subpopulation summary statistics
#For example, let's calculate body weight summary statistics by SEX
BW_by_SEX <- PKdata %>%
  group_by(SEX) %>%
  summarise(BW_mean = mean(BW),
            BW_upper = quantile(BW, probs = 0.975),
            BW_lower = quantile(BW, probs = 0.225))
## `summarise()` ungrouping output (override with `.groups` argument)
#Let's plot it!
ggplot(BW_by_SEX) +
  geom_point(aes(x=SEX, y=BW_mean)) +
  geom_errorbar((aes(x=SEX, ymin=BW_lower, ymax=BW_upper)))

#An easier way to do it in just ggplot, there are always multiple ways to do the same thing
ggplot(PKdata) +
  geom_boxplot(aes(x=as.factor(SEX), y=BW))

Let’s calculate the median PK profile

#Let's start with calculating median and percentile profiles 
Med <- PKdata %>%
  group_by(TIME) %>%
  summarise(median = median(DV),
            upper = quantile(DV, probs=0.975), #Upper 95th percentile
            lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` ungrouping output (override with `.groups` argument)
#Let's plot it
ggplot(Med) +
  geom_line(aes(x=TIME, y=median)) +
  geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper), alpha=0.3)

Let’s compare median PK profiles of subpopulations

#Let's do the same thing but subset by DOSE
DOSE_Med <- PKdata %>%
  group_by(DOSE,TIME) %>%
  summarise(median = median(DV),
            upper = quantile(DV, probs=0.975), #Upper 95th percentile
            lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'DOSE' (override with `.groups` argument)
#Let's plot it
ggplot(DOSE_Med) +
  geom_line(aes(x=TIME, y=median, color = as.factor(DOSE))) +
  geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(DOSE)), alpha=0.3)

Let’s try another one…

#Median PK profiles, subset by HIV

HIV_Med <- PKdata %>%
  group_by(HIV,TIME) %>%
  summarise(median = median(DV),
            upper = quantile(DV, probs=0.975), #Upper 95th percentile
            lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'HIV' (override with `.groups` argument)
#Let's plot it
ggplot(HIV_Med) +
  geom_line(aes(x=TIME, y=median, color = as.factor(HIV))) +
  geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(HIV)), alpha=0.3)

This doesn’t really work…the different dosing levels make the stratification cluttered

#We need to subset by DOSE and HIV
#Prepare the dataset yourself

HIV_Med <- PKdata %>%
  group_by(DOSE, HIV,TIME) %>%
  summarise(median = median(DV),
            upper = quantile(DV, probs=0.975), #Upper 95th percentile
            lower = quantile(DV, probs=0.225)) #Lower 95th percentile
## `summarise()` regrouping output by 'DOSE', 'HIV' (override with `.groups` argument)
#Let's plot it
ggplot(HIV_Med) +
  geom_line(aes(x=TIME, y=median, color = as.factor(HIV))) +
  geom_ribbon(aes(x=TIME, ymin=lower, ymax=upper, fill = as.factor(HIV)), alpha=0.3) +
  facet_wrap(.~DOSE)